# if (!require("BiocManager", quietly = TRUE))
# install.packages("BiocManager")
# BiocManager::install("oligo")
# BiocManager::install("limma")
# BiocManager::install("biomaRt")
#package for reading CEL files, will add more packages for further steps
csv_file <- read.table(file = r"(C:\Users\linde\rmarkdown\affy_processing\data_runsheets\GLDS_6_runsheet.csv)",
sep=",",
header = TRUE,
check.names = FALSE
)
array_data_files <- csv_file["Path to Raw Data File"] #column name of paths
raw_data <- oligo::read.celfiles(array_data_files) #ExpressionFeatureSet Class
## Platform design info loaded.
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep1_GSM144733_raw1.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep2_GSM144734_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_CTRL_Rep3_GSM144735_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep1_GSM144736_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep2_GSM144737_raw.CEL.gz
## Reading in : C:\Users\linde\rmarkdown\input_data\GLDS6\GLDS-6_array_Rnor_SDR_keratinocytes_IR_56Fe_Rep3_GSM144738_raw.CEL.gz
print("Verify Data step")
## [1] "Verify Data step"
density_plot <- hist(raw_data, transfo=log2, which=c("pm", "mm", "bg", "both", "all"),
nsample=10000, target = "core", main = "Density Plot")
## TODO: Add sample legend
#legend(15, 0.35, legend=c(colnames(raw_table)), lty=1:2, cex = 1)
# legend() is not compatable
# transfo : used to scale the data
# which : defines specific probe types
# nsample : sample size used to produce plot
# target : specifies group of meta-probeset
# main : main title
pseudoimage <- image(raw_data, transfo=log2)
MA_plot <- MAplot(raw_data)
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, :
## Binning grid too coarse for current (small) bandwidth: consider increasing
## 'gridsize'
## Warning in KernSmooth::bkde2D(x, bandwidth = bandwidth, gridsize = nbin, :
## Binning grid too coarse for current (small) bandwidth: consider increasing
## 'gridsize'
### QA: Statistics
# BiocManager::install("AnnotationDbi")
# BiocManager::install("Biobase")
# The IQRray statistic is obtained by ranking all the probe intensities from
# a given array and by computing the average rank for each probe set. The
# interquartile range (IQR) of the probe sets average ranks serves then as
# quality score.
#Example of a low score: 122783.6
# high score: 226124.6
library(methods)
library(AnnotationDbi)
library(Biobase)
#data - Affybatch object obtained after reading in celfiles into R with function ReadAffy from package affy
#obtaining intensity values for perfect match (pm) probes
pm_data<-pm(raw_data)
#ranking probe intensities for every array
rank_data<-apply(pm_data,2,rank)
#obtaining names of probeset for every probe
probeNames<-probeNames(raw_data)
#function computing IQR of mean probe ranks in probesets
get_IQR<-function(rank_data,probeNames){round(IQR(sapply(split(rank_data,probeNames),mean)),digits=2)}
#computing arIQR score
IQRray_score<-apply(rank_data,2,get_IQR,probeNames=probeNames)
raw_table <- exprs(raw_data)
# RMA is most commonly used method for preprocessing normalization of Affymetrix microarray data. There
# are three steps in the RMA algorithm: Convolution Background Correction, Quantile Normalization, and
# Summarization using Median Polish.
normRMA <- rma(raw_data)
## Background correcting
## Normalizing
## Calculating Expression
norm_probeset_table <- exprs(normRMA)
norm_probeset_table <- as.data.frame(norm_probeset_table)
#Human Exon array probeset to gene-level expression (biostars.org, https://www.biostars.org/p/271379/)
#Probe Level normalization
backgroundCorrection <- preprocessCore::rma.background.correct(raw_table)
probelevel_table <- preprocessCore::normalize.quantiles(backgroundCorrection, keep.names=TRUE)
MAplot_norm <- MAplot(normRMA)
densityPlot_norm <- hist(normRMA)
## PCA plot
pca <- prcomp(t(norm_probeset_table))
plot(pca$x[,1:2])
## TODO: make sample legend
#legend('topright', col=1:6, legend = paste("Samples", 1:6), pch=20, bty='n', cex=.75)
## Getting Factor Values
fv <- data.frame(csv_file["Factor Value"])
all_factor_values <- fv[,1]
factor_values <- unique(all_factor_values)
all.factor.values <- make.names(all_factor_values)
factor.values <- make.names(factor_values)
## Creating Design
ph = raw_data@phenoData
ph@data[ ,2] = c(all.factor.values)
colnames(ph@data)[2]="source"
groups = ph@data$source
f = factor(groups,levels = factor.values)
design = model.matrix(~ 0 + f)
colnames(design) <- factor.values
## Creating Constrast Matrix
fit = limma::lmFit(norm_probeset_table,design)
fit.groups <- colnames(fit$design) [which(fit$assign == 1)]
fit.index <- which(levels(levels) %in% fit.groups)
fit.group.names <- gsub(" ", "_", sub(",", "_", unique(csv_file$'Factor Value')))
combos <- combn(fit.groups, 2)
combos.names <- combn(fit.group.names, 2)
contrasts <- c(paste(combos[1,], combos[2,], sep = "-", paste(combos[2,], combos[1,], sep="-")))
contrast.names <- c(paste(combos.names[1,], combos.names[2,], sep = "v"), paste(combos.names[2,],
combos.names[1,], sep = "v"))
cont.matrix <- limma::makeContrasts(contrasts = contrasts, levels=design)
contrast.fit <- limma::contrasts.fit(fit, cont.matrix)
## Testing and Accessing Results
contrast.fit.eb <- limma::eBayes(contrast.fit)
log_fold_change <- contrast.fit.eb$coefficients
log.fold.change <- c(log_fold_change) #so name is better in final dataframe
p_value <- contrast.fit.eb$p.value
p.value <- c(p_value)
adjusted_pvalue <- p.adjust(p_value,method="BH")
##TODO debug decideTests and summary functions
#results given at probeset level
#difficult to put into dataframe for more than one
#dge_dataframe <- cbind(norm_probeset_table, log.fold.change, p.value, adjusted_pvalue)
# for glds 6
# TODO: figure out how to separate factor values not manually
non_irradiated <- norm_probeset_table[,c(1,2,3)]
ion_beam_radiation <- norm_probeset_table[,c(4,5,6)]
avg_fv_weird_names_6 <- data.frame(rowMeans(non_irradiated), rowMeans(ion_beam_radiation))
#avg_fv_6 <- rename(avg_fv_weird_names_6, "Mean Non Irradiated" = "rowMeans.non_irradiated.", "Mean 56Fe Ion Beam Radiation" = "rowMeans.ion_beam_radiation.")
dge_dataframe <- cbind(norm_probeset_table, log.fold.change, p.value, adjusted_pvalue, avg_fv_weird_names_6)
#write.csv(dge_dataframe, r"(C:\Users\linde\rmarkdown\notebook\glds_6_DGE.csv)")
# only run once
## TODO: make dataframe for glds 25 and glds 189
library(biomaRt)
library(dplyr)
library(tibble)
## GLDS 25
listEnsembl()
## biomart version
## 1 genes Ensembl Genes 107
## 2 mouse_strains Mouse strains 107
## 3 snps Ensembl Variation 107
## 4 regulation Ensembl Regulation 107
ensembl <- useEnsembl(biomart = "genes")
searchDatasets(mart = ensembl, pattern = "Mouse")
## dataset description version
## 107 mmurinus_gene_ensembl Mouse Lemur genes (Mmur_3.0) Mmur_3.0
## 108 mmusculus_gene_ensembl Mouse genes (GRCm39) GRCm39
ensembl <- useDataset(dataset = "mmusculus_gene_ensembl", mart = ensembl)
searchAttributes(mart = ensembl, pattern = "affy_")
## name description page
## 95 affy_mg_u74a AFFY MG U74A probe feature_page
## 96 affy_mg_u74av2 AFFY MG U74Av2 probe feature_page
## 97 affy_mg_u74b AFFY MG U74B probe feature_page
## 98 affy_mg_u74bv2 AFFY MG U74Bv2 probe feature_page
## 99 affy_mg_u74c AFFY MG U74C probe feature_page
## 100 affy_mg_u74cv2 AFFY MG U74Cv2 probe feature_page
## 101 affy_moe430a AFFY MOE430A probe feature_page
## 102 affy_moe430b AFFY MOE430B probe feature_page
## 103 affy_moex_1_0_st_v1 AFFY MoEx 1 0 st v1 probe feature_page
## 104 affy_mogene_1_0_st_v1 AFFY MoGene 1 0 st v1 probe feature_page
## 105 affy_mogene_2_1_st_v1 AFFY MoGene 2 1 st v1 probe feature_page
## 106 affy_mouse430a_2 AFFY Mouse430A 2 probe feature_page
## 107 affy_mouse430_2 AFFY Mouse430 2 probe feature_page
## 108 affy_mu11ksuba AFFY Mu11KsubA probe feature_page
## 109 affy_mu11ksubb AFFY Mu11KsubB probe feature_page
my_affy_ids <- c(head(rownames(norm_probeset_table), 2000))
gene_annotations <- getBM(
attributes = c(
"affy_mogene_1_0_st_v1",
"ensembl_gene_id",
"uniprot_gn_symbol",
"go_id"
), # Columns we want, these were found using searchAttributes
filters = "affy_mogene_1_0_st_v1", # Filters (one or more) that should be used in the query
values = my_affy_ids, # Query IDs
mart = ensembl)
## GLDS 6
listEnsembl()
## biomart version
## 1 genes Ensembl Genes 107
## 2 mouse_strains Mouse strains 107
## 3 snps Ensembl Variation 107
## 4 regulation Ensembl Regulation 107
ensembl <- useEnsembl(biomart = "genes")
searchDatasets(mart = ensembl, pattern = "Rat")
## dataset description version
## 167 rnorvegicus_gene_ensembl Rat genes (mRatBN7.2) mRatBN7.2
ensembl <- useDataset(dataset = "rnorvegicus_gene_ensembl", mart = ensembl)
searchAttributes(mart = ensembl, pattern = "affy_")
## name description page
## 91 affy_rae230a AFFY RAE230A probe feature_page
## 92 affy_rae230b AFFY RAE230B probe feature_page
## 93 affy_raex_1_0_st_v1 AFFY RaEx 1 0 st v1 probe feature_page
## 94 affy_ragene_1_0_st_v1 AFFY RaGene 1 0 st v1 probe feature_page
## 95 affy_ragene_2_1_st_v1 AFFY RaGene 2 1 st v1 probe feature_page
## 96 affy_rat230_2 AFFY Rat230 2 probe feature_page
## 97 affy_rg_u34a AFFY RG U34A probe feature_page
## 98 affy_rg_u34b AFFY RG U34B probe feature_page
## 99 affy_rg_u34c AFFY RG U34C probe feature_page
## 100 affy_rn_u34 AFFY RN U34 probe feature_page
## 101 affy_rt_u34 AFFY RT U34 probe feature_page
#my_affy_ids <- c(head(rownames(norm_probeset_table), 5000))
# example to test
my_affy_ids <- rownames(norm_probeset_table)
# all probeset names but currently takes too long, ~15 minutes to run
gene_annotations <- getBM(
attributes = c(
"affy_rat230_2",
"ensembl_gene_id",
"uniprot_gn_symbol",
"go_id"
),
filters = "affy_rat230_2",
values = my_affy_ids,
mart = ensembl)
gene_annotations <- gene_annotations %>%
group_by(affy_rat230_2) %>% # group by ensembl_gene_id
# then for each group
summarise(
# Convert uniprot_gn_symbol to a string of unique values in the group (i.e. all symbols mapped to the same ensembl ID) and rename as "SYMBOL"
SYMBOL = toString(unique(uniprot_gn_symbol)),
# Convert probe IDs to a string of unique IDs in the group (i.e. all probe IDs mapped to the same ensembl ID)
ENSEMBL = toString(unique(ensembl_gene_id)),
# Convert go_ids to a string of unique IDs in the group (i.e. all go_ids mapped to the same ensembl ID) and rename as "GO_Ids"
GO_Ids = toString(unique(go_id))
)
final_dataframe <- rownames_to_column(dge_dataframe, var="probesets") %>% left_join(gene_annotations,by = c("probesets" = "affy_rat230_2"))
#write.csv(final_dataframe, r"(C:\Users\linde\rmarkdown\notebook\GLDS_6_DGE.csv)")
#results <- limma::decideTests(contrast.fit.eb, method = "separate", adjust.method = "BH", p.value = 0.05, lfc = 0.5)
#summary(limma::decideTests(contrast.fit.eb[,-1]))
#Error in h(simpleError(msg, call)) :
# error in evaluating the argument 'object' in selecting a method for function 'summary':
# subscript out of bounds
# for glds 25, having difficulty
#avg_fv_weird_names_25 <- data.frame(rowMeans(norm_probeset_table[,c(1,2,3,4,5)]), rowMeans(norm_probeset_table[,c(6,7,8,9,10,11)]),
# rowMeans(norm_probeset_table[,c(12,13,14,15,16,17)]))
#avg_fv_25 <- rename(avg_fv_weird_names_25, )
#dge_dataframe <- cbind(norm_probeset_table, log_fold_change, p_value, adjusted_pvalue)
#write.csv(dge_dataframe, r"(C:\Users\linde\rmarkdown\notebook\glds_25_DGE.csv)")
# didn't work, everything currently written to dge 25 csv is all glds 6 stuff
#testresults_25 <- data.frame(log_fold_change, p_value, adjusted_pvalue)
# log_fold_change_df <- data.frame(log_fold_change)
# p_value_df <- data.frame(p_value)
# pvalue_gcsf <- c(p_value_df[,1])
# pvalue_gcvc <- c(p_value_df[,2])
# pvalue_sfvc <- c(p_value_df[,3])
# adjusted_pvalue_gcsf <- p.adjust(p_value_df[,1],method="BH")
# adjusted_pvalue_gcvc <- p.adjust(p_value_df[,2],method="BH")
# adjusted_pvalue_sfvc <- p.adjust(p_value_df[,3],method="BH")
# dge_dataframe <- cbind(norm_probeset_table, log_fold_change_df, p_value_df, adjusted_pvalue_gcsf,
# adjusted_pvalue_gcvc, adjusted_pvalue_sfvc)
# #trying to make a contrast matrix
# comparisons <- c()
# for (i in range(length(factor_values)))
# {
# comparisons <- append(comparisons, fv[i]'-'fv[i+1])
# }
# #attempt 2
# counter <- 2
# for (fv in factor_values)
# {
# comparisons <- append(comparisons, fv'-'fv[counter])
# counter <- counter + 1
# }
# #gene mapping with adf
# adf <- read.delim(file=r"(C:\Users\linde\OneDrive\Desktop\GLDS-6_metadata_A-AFFY-43.adf.txt)", header=FALSE, skip=74)
# probeset_names <- c(adf[1])
# ensembl_id_with_emptys <- adf[12]
# refseq <- c(adf[2])
# ensembl_id <- lapply(ensembl_id_with_emptys, function(x) x[x!=""])
# probeset_ensembl_refseq <- data.frame(probeset_names, ensembl_id_with_emptys, refseq)
# ens <- c(probeset_ensembl_refseq[,2])
# probeset <- c(probeset_ensembl_refseq[,1])
# rs <- c(probeset_ensembl_refseq[,3])
# prfromnorm <- rownames(norm_probeset_table)
# #probeset_and_ensembl <- data.frame(probeset_names, ensembl_id)
# c <- 1
# probeset_refseq_dict <- c()
# for (i in probeset)
# {
# probeset_refseq_dict[rs[c]]<-i
# c <- c + 1
# }
# rsdupsbool <- duplicated(rs)
# dupsindex<-c()
# counter <- 1
# for (bool in rsdups)
# {
# if (bool=="TRUE")
# {
# dupsindex <- append(dupsindex, counter)
# }
# counter <- counter + 1
# }
# duprefseq <- c()
# for (index in dupsindex)
# {
# duprefseq <- append(duprefseq, rs[index])
# }
# #rsdups <- duplicated(rs)
# uniquers <- unique(rs)
# probeset_refseq_dict <- c()
# #probeset_with_uniquerefseq[uniquers[probeset_with_refseq[ps]]]<-ps
# norm <- normalize(raw_data)
# all_uniprot <- gene_annotations$uniprot_gn_symbol
# go_ids <- gene_annotations$go_id
# all_probesets <- gene_annotations$affy_rat230_2
# all_ensembl_ids <- gene_annotations$ensembl_gene_id #len: 114871
# ensembl_ids <- unique(all_ensembl_ids) #len: 4700
# probesets <- unique(all_probesets) #len: 4624
# uniprot <- unique(all_uniprot) #len: 3281
# #trying to get gene annotations, but cant use all probesets/ensembl ids because too large
# dgedf_exampleset <- dge_dataframe[udf$all_probesets, ]
# dgedf_exampleset_with_ensembl <- cbind(dgedf_exampleset, udf$all_ensembl_ids)
# #ensembl_to_probeset <- data.frame(all_probesets, row.names = all_ensembl_ids)
# dic <- c()
# counter <- 1
# for (i in ensembl_ids)
# {
# dic[i] <- all_probesets[counter]
# counter <- counter + 1
# }
# #does not work
# # d <- unique(dic)
# # e <- c(d)
# df <- data.frame(all_ensembl_ids, all_probesets)
# udf <- unique(df) #len: 5079
# ens <- udf$all_ensembl_ids
# gene_annotations_PandE <- getBM(
# attributes = c(
# "affy_rat230_2",
# "ensembl_gene_id"
# ),
# filters = "affy_rat230_2",
# values = my_affy_ids,
# mart = ensembl)
# #len: 5079
# "
# affy_rat230_2 ensembl_gene_id uniprot_gn_symbol go_id
# 1 1370109_s_at ENSRNOG00000029996 GO:0005525
# 2 1370109_s_at ENSRNOG00000029996 GO:0003924
# 3 1370109_s_at ENSRNOG00000029996 GO:0003746
# 4 1370109_s_at ENSRNOG00000029996 GO:0006414
# 5 1370109_s_at ENSRNOG00000029996 GO:0000166
# 6 1370109_s_at ENSRNOG00000029996 GO:0006412
# unique df
# all_ensembl_ids all_probesets
# 1 ENSRNOG00000029996 1370109_s_at
# 7 ENSRNOG00000016356 1368272_at
# 27 ENSRNOG00000009377 1372399_at
# 41 ENSRNOG00000027204 1368345_at
# 50 ENSRNOG00000027204 1371087_a_at
# 59 ENSRNOG00000021051 1367727_at
# gene annotations without go id or uniprot
# affy_rat230_2 ensembl_gene_id
# 1 1370109_s_at ENSRNOG00000029996
# 2 1368272_at ENSRNOG00000016356
# 3 1372399_at ENSRNOG00000009377
# 4 1368345_at ENSRNOG00000027204
# 5 1371087_a_at ENSRNOG00000027204
# 6 1367727_at ENSRNOG00000021051
# udf and new gene annotations are basically same thing
# "
# #make a data frame where rownames are the probeset names
# #make one for each sample for expressions
# # have to make intermediate data frame with probeset matching to ensembl to get expression values
# # matched with ensembl
# #data frame with:
# # probeset names
# # ensembl ids
# # expression values
# #expressions are in normRMA/norm_probeset_table
# #to find all probesets that go with one ensembl id:
# # make list of ensembl ids from gene annotations
# # find unique
# # make df where ensembl ids are rownames
#is there a function to split alike objects in a list?
# how I would do it in python:
# lofl = []
# for i in range(len(factor_values)):
# nl = []
# for f in all_factor_values:
# if f == factor_values[i]:
# nl.append(f)
# lofl.append(nl)
# nl = []
# firstfv = lofl[0]
# secondfv = lofl[1]